library(Seurat)
library(tidyverse)
library(VennDiagram)
library(clusterProfiler)
library(org.Hs.eg.db)
dat <- readRDS("pseudobulk/DE_data.Rds")
dat <- SetIdent(dat, value="label")
dat <- NormalizeData(dat)
All the results were filtered by the adj.p-value < 0.05 and logFC >= 0.3
dat
## An object of class Seurat
## 15327 features across 6961 samples within 1 assay
## Active assay: RNA (15327 features, 0 variable features)
## 2 dimensional reductions calculated: umap, pca
For single cell methods I am going to consider both Seurat and Scanpy
For Scanpy I take in account the methods: - t-test_overestim_var - wilcoxon
Unfortunately the logistic regression provided by scanpy does not give LogFoldChanges and will not be taken in account.
scanpy_t = read.csv("scanpy/DEG_adata_t.csv")
scanpy_t$X <- NULL
scanpy_wx <- read.csv("scanpy/DEG_adata_wx.csv")
scanpy_wx$X <- NULL
num_genes <- data.frame(matrix(ncol = 2, nrow = 0))
colnames(num_genes) <- c("Method", "nGenes")
newrow <- c("scanpy_t", dim(scanpy_t)[1])
num_genes[nrow(num_genes) + 1, ] <- newrow
newrow <- c("scanpy_wx", dim(scanpy_wx)[1])
num_genes[nrow(num_genes) + 1, ] <- newrow
plot_genes <- function(num_genes){
num_genes$nGenes = as.integer(num_genes$nGenes)
ggplot(num_genes, aes(x=Method, y=nGenes, fill=Method))+
geom_bar(stat="identity",width = 0.5)+
geom_text(aes(label=nGenes), vjust=1.6, color="white", size=3.5)+
theme_minimal()
}
plot_genes(num_genes)
grid.newpage()
venn_object <- venn.diagram(
x = list(scanpy_t$Symbol, scanpy_wx$Symbol),
category.names = c("scanpy_t", "scanpy_wx"),
filename = NULL
)
grid.draw(venn_object)
The t-test_overestim_var recognized as significative almost all the genes and include almost all the genes from the wilcoxon test. Nolw let’s evaluate the expression of the genes.
Top upregulated genes in Erythrocytes:
plot_top_genes <- function(data, column){
max_lfc <- data[order(-data[column]),] %>%
head(3)
max_lfc <- max_lfc$Symbol
RidgePlot(dat, features = max_lfc)
}
plot_top_genes(scanpy_t,"logfoldchanges")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.0665
## Picking joint bandwidth of 0.0505
## Picking joint bandwidth of 0.0478
Top downregulated genes in Erythrocytes:
plot_bottom_genes <- function(data, column){
max_lfc <- data[order(-data[column]),] %>%
tail(3)
max_lfc <- max_lfc$Symbol
RidgePlot(dat, features = max_lfc)
}
plot_bottom_genes(scanpy_t,"logfoldchanges")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.0478
## Picking joint bandwidth of 0.0485
## Picking joint bandwidth of 0.069
Genes with lower abs(logFC)
plot_low_genes <- function(data,column){
max_lfc <- data[order(-data[column]),] %>%
tail(3)
max_lfc <- max_lfc$Symbol
RidgePlot(dat, features = max_lfc)
}
plot_low_genes(scanpy_t,"abs_lfc")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.144
## Picking joint bandwidth of 1.98e-06
## Picking joint bandwidth of 1.98e-06
The result’s in terms of LogFC looks really odd. Let’s see the values for this genes:
scanpy_t[scanpy_t$Symbol %in% c("KRT1","ARG1", "LINC00570", "CD72", "ARPP21", "MME"),]
## Symbol pvals_adj logfoldchanges abs_lfc scores
## 68 KRT1 0.000000e+00 29.73123 29.73123 48.64429
## 113 LINC00570 3.816327e-227 28.53994 28.53994 35.04078
## 131 ARG1 2.312735e-192 28.81118 28.81118 31.80785
## 13375 ARPP21 1.717660e-226 -29.04441 29.04441 -34.57604
## 13480 CD72 1.808806e-242 -28.93440 28.93440 -35.96811
## 13887 MME 0.000000e+00 -29.52419 29.52419 -44.01670
## Aberrant.erythroid Pro.B.cells
## 68 0.4846416 0.0000000
## 113 0.3146137 0.0000000
## 131 0.2832765 0.0000000
## 13375 0.0000000 0.2891921
## 13480 0.0000000 0.3130016
## 13887 0.0000000 0.4122525
Apparently Scanpy has a bias evaluating the logFC of
genes that are completely missing in one of the conditions.
But The genes are correctly plotted in the report. So I will check the expression by
the z-score instead of the LogFC:
print("Top upregulated genes")
## [1] "Top upregulated genes"
plot_top_genes(scanpy_t,"scores")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.0407
## Picking joint bandwidth of 0.05
## Picking joint bandwidth of 0.089
print("Top downregulated genes")
## [1] "Top downregulated genes"
plot_bottom_genes(scanpy_t,"scores")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.0755
## Picking joint bandwidth of 0.035
## Picking joint bandwidth of 0.0609
print("lowest abs(LFC) genes")
## [1] "lowest abs(LFC) genes"
max_lfc <- scanpy_t[order(-abs(scanpy_t["scores"])),] %>%
tail(3)
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
max_lfc <- max_lfc$Symbol
RidgePlot(dat, features = max_lfc)
## Picking joint bandwidth of 1.78e-06
## Picking joint bandwidth of 1.78e-06
## Picking joint bandwidth of 1.78e-06
Better result for the extremes logFC (scores), bt still not clear results for the low scored genes. Indeed those genes are expressed by less than 1% of cells:
scanpy_t[scanpy_t$Symbol %in% max_lfc,]
## Symbol pvals_adj logfoldchanges abs_lfc scores Aberrant.erythroid
## 816 MIR133A1HG 0.04988015 -2.444078 2.444078 -1.980822 0.0006205399
## 817 UACA 0.04972146 -1.606474 1.606474 -1.982030 0.0009308098
## 818 CEP170B 0.04952603 -2.087353 2.087353 -1.983738 0.0003102699
## Pro.B.cells
## 816 0.002140182
## 817 0.005082932
## 818 0.003477796
plot_upGOs <- function(data,column){
upgenes <- data[data[column] > 0,] %>%
dplyr::select(Symbol) %>%
pull()
upGO <- enrichGO(upgenes, org.Hs.eg.db, keyType = "SYMBOL", ont="BP", pAdjustMethod = "BH", readable="False")
upGO <- simplify(upGO, cutoff=0.7, by="p.adjust", select_fun = min,measure = "Wang")
print("Upregulated GO terms")
dotplot(upGO)
}
plot_downGOs <- function(data,column){
downgenes <- data[data[column] < 0,] %>%
dplyr::select(Symbol) %>%
pull()
downGO <- enrichGO(downgenes, org.Hs.eg.db, keyType = "SYMBOL", ont="BP", pAdjustMethod = "BH", readable="False")
downGO <- simplify(downGO, cutoff=0.7, by="p.adjust", select_fun = min,measure = "Wang")
print("Downregulated GO terms")
dotplot(downGO)
}
plot_upGOs(scanpy_t, "scores")
## [1] "Upregulated GO terms"
plot_downGOs(scanpy_t, "scores")
## [1] "Downregulated GO terms"
Few up terms can be linked to aberrant erythroid cells, down terms looks slightly more specific for the condition of pro0B cells
print("Top upregulated genes")
## [1] "Top upregulated genes"
plot_top_genes(scanpy_wx,"logfoldchanges")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.0665
## Picking joint bandwidth of 0.0505
## Picking joint bandwidth of 0.0478
print("Top downregulated genes")
## [1] "Top downregulated genes"
plot_bottom_genes(scanpy_wx,"logfoldchanges")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.0478
## Picking joint bandwidth of 0.0485
## Picking joint bandwidth of 0.069
print("lowest abs(LFC) genes")
## [1] "lowest abs(LFC) genes"
plot_low_genes(scanpy_wx,"abs_lfc")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 2.51e-06
## Picking joint bandwidth of 0.0327
## Picking joint bandwidth of 1.98e-06
plot_upGOs(scanpy_wx, "scores")
## [1] "Upregulated GO terms"
plot_downGOs(scanpy_wx, "scores")
## [1] "Downregulated GO terms"
Similar result with wilcoxon, slightly better GO terms
For Seurat I take in account the methods: - t-test - wilcoxon - logistic regression
seurat_t <- read_csv("seurat/t-test_seurat.csv")%>%
filter(cluster == "Aberrant erythroid")
## Rows: 8142 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): cluster, gene
## dbl (6): p_val, avg_log2FC, pct.1, pct.2, p_val_adj, abs_lfc
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
names(seurat_t)[names(seurat_t) == 'gene'] <- 'Symbol'
newrow <- c("seurat_t", dim(seurat_t)[1])
num_genes[nrow(num_genes) + 1, ] <- newrow
seurat_wx <- read_csv("seurat/wilcoxon_seurat.csv")%>%
filter(cluster == "Aberrant erythroid")
## Rows: 8134 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): cluster, gene
## dbl (6): p_val, avg_log2FC, pct.1, pct.2, p_val_adj, abs_lfc
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
names(seurat_wx)[names(seurat_wx) == 'gene'] <- 'Symbol'
newrow <- c("seurat_wx", dim(seurat_wx)[1])
num_genes[nrow(num_genes) + 1, ] <- newrow
seurat_logreg <- read_csv("seurat/wilcoxon_seurat.csv")%>%
filter(cluster == "Aberrant erythroid")
## Rows: 8134 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): cluster, gene
## dbl (6): p_val, avg_log2FC, pct.1, pct.2, p_val_adj, abs_lfc
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
names(seurat_logreg)[names(seurat_logreg) == 'gene'] <- 'Symbol'
newrow <- c("seurat_logreg", dim(seurat_logreg)[1])
num_genes[nrow(num_genes) + 1, ] <- newrow
plot_genes(num_genes)
grid.newpage()
venn_object <- venn.diagram(
x = list(seurat_t$Symbol, seurat_wx$Symbol, seurat_logreg$Symbol),
category.names = c("seurat_t", "seurat_wx", "seurat_logreg"),
filename = NULL
)
grid.draw(venn_object)
Similar numbers of DE genes were found by Seurat’s different flavours.
print("Top upregulated genes")
## [1] "Top upregulated genes"
plot_top_genes(seurat_t,"avg_log2FC")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.0407
## Picking joint bandwidth of 0.193
## Picking joint bandwidth of 0.188
print("Top downregulated genes")
## [1] "Top downregulated genes"
plot_bottom_genes(seurat_t,"avg_log2FC")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.12
## Picking joint bandwidth of 0.084
## Picking joint bandwidth of 0.0609
print("lowest abs(LFC) genes")
## [1] "lowest abs(LFC) genes"
plot_low_genes(seurat_t,"abs_lfc")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 2.03e-06
## Picking joint bandwidth of 2.06e-06
## Picking joint bandwidth of 1.95e-06
plot_upGOs(seurat_t, "avg_log2FC")
## [1] "Upregulated GO terms"
plot_downGOs(seurat_t, "avg_log2FC")
## [1] "Downregulated GO terms"
Better result with seurat, the top upregulated genes are strictly correlated with erythrocytes. Only CD74 is downregulated and correlated with immune cells. The low abs(LFC) genes still are not well evaluable. In average the different gene expression is better appreciable
GO terms are also better compared to scanpy
print("Top upregulated genes")
## [1] "Top upregulated genes"
plot_top_genes(seurat_wx,"avg_log2FC")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.0407
## Picking joint bandwidth of 0.193
## Picking joint bandwidth of 0.188
print("Top downregulated genes")
## [1] "Top downregulated genes"
plot_bottom_genes(seurat_wx,"avg_log2FC")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.12
## Picking joint bandwidth of 0.084
## Picking joint bandwidth of 0.0609
print("lowest abs(LFC) genes")
## [1] "lowest abs(LFC) genes"
plot_low_genes(seurat_wx,"abs_lfc")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 2.03e-06
## Picking joint bandwidth of 2.06e-06
## Picking joint bandwidth of 1.95e-06
Similar to the previous
print("Top upregulated genes")
## [1] "Top upregulated genes"
plot_top_genes(seurat_logreg,"avg_log2FC")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.0407
## Picking joint bandwidth of 0.193
## Picking joint bandwidth of 0.188
print("Top downregulated genes")
## [1] "Top downregulated genes"
plot_bottom_genes(seurat_logreg,"avg_log2FC")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.12
## Picking joint bandwidth of 0.084
## Picking joint bandwidth of 0.0609
print("lowest abs(LFC) genes")
## [1] "lowest abs(LFC) genes"
plot_low_genes(seurat_logreg,"abs_lfc")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 2.03e-06
## Picking joint bandwidth of 2.06e-06
## Picking joint bandwidth of 1.95e-06
As above.
Now let’s see the genes fuond uniquely by logreg/wx and t-test:
Unique in wx/logeg:
unique_wx = setdiff(seurat_logreg$Symbol,seurat_t$Symbol)
seurat_logreg[seurat_logreg$Symbol %in% unique_wx,]
## # A tibble: 1 × 8
## p_val avg_log2FC pct.1 pct.2 p_val_adj cluster Symbol abs_lfc
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl>
## 1 1.55e-11 0.319 0.289 0.407 0.000000237 Aberrant erythroid NEAT1 0.319
RidgePlot(dat, features = unique_wx)
## Picking joint bandwidth of 0.139
VlnPlot(dat, features = unique_wx, group.by = "bioprojects", split.by = "label")
## The default behaviour of split.by has changed.
## Separate violin plots are now plotted side-by-side.
## To restore the old behaviour of a single split violin,
## set split.plot = TRUE.
##
## This message will be shown once per session.
## Warning: Groups with fewer than two data points have been dropped.
NEAT1 result upregulated in Aberrant erythroid cells only with wilcoxon/logreg method. The logFC are quite small (0.32) and the distribution is similar between the cell lines. Looking at the expression divided by the batch, no batch specific differential expression is available.
Unique in t-test:
unique_t = setdiff(seurat_t$Symbol,seurat_logreg$Symbol)
seurat_t[seurat_t$Symbol %in% unique_t,]
## # A tibble: 5 × 8
## p_val avg_log2FC pct.1 pct.2 p_val_adj cluster Symbol abs_lfc
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl>
## 1 1.11e-15 0.469 0.237 0.216 1.70e-11 Aberrant erythroid RNF19A 0.469
## 2 2.33e- 7 0.432 0.24 0.284 3.58e- 3 Aberrant erythroid MGST3 0.432
## 3 3.87e-12 0.311 0.207 0.175 5.93e- 8 Aberrant erythroid ZNF451 0.311
## 4 2.10e- 6 -0.313 0.398 0.391 3.22e- 2 Aberrant erythroid ODC1 0.313
## 5 2.39e- 7 -0.520 0.483 0.457 3.67e- 3 Aberrant erythroid REXO2 0.520
RidgePlot(dat, features = unique_t)
## Picking joint bandwidth of 3.07e-06
## Picking joint bandwidth of 0.0324
## Picking joint bandwidth of 2.6e-06
## Picking joint bandwidth of 0.13
## Picking joint bandwidth of 0.144
VlnPlot(dat, features = unique_t, group.by = "bioprojects", split.by = "label", ncol = 2)
## Warning: Groups with fewer than two data points have been dropped.
## Warning: Groups with fewer than two data points have been dropped.
## Warning: Groups with fewer than two data points have been dropped.
## Warning: Groups with fewer than two data points have been dropped.
## Warning: Groups with fewer than two data points have been dropped.
All the genes doesn’t have a clear difference in the expression among the batches, I will drop the seurat t-test and keep the logreg/wx
In the pseudobulks, counts were summed for each celltype and sample, in practice obtaining a column of counts for each celltype and sample. Then, genes that weren’t showing counts in more than 90% of the cells were removed. Also, genes showing expression below 10 counts in all the pseudobulks were removed (pseudobulks are the sum of cells counts. showing less than 10 counts indicate almost all the cells didn’t had count’s for that gene).
pseudobulk = read_csv("pseudobulk/Pseudobulk_clean.csv")
## Rows: 11505 Columns: 121
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Symbol
## dbl (120): Aberrant_erythroid_MGUS_CD138nCD45p_2, Aberrant_erythroid_MGUS_CD...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dim(pseudobulk)
## [1] 11505 121
For DESeq2, firstly I run it with the “LRT” methods that showed better results here. The PCA plot looks good but the clustering mix some samples here. I run DESeq2 specifing the model to take account of the batches.
Then, I tested combat_seq to correct the batch effect here. Also, I decided to see
if there was a batch effect afrom unknown sources using the single
variable analysis (sva) here.
deseq2 <- read_csv("pseudobulk/DESeq2.csv") %>%
mutate(abs_lfc = abs(log2FoldChange))
## Rows: 1945 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): gene
## dbl (6): baseMean, log2FoldChange, lfcSE, stat, pvalue, padj
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
names(deseq2)[names(deseq2) == 'gene'] <- 'Symbol'
newrow <- c("deseq2", dim(deseq2)[1])
num_genes[nrow(num_genes) + 1, ] <- newrow
deseq2_combat <- read_csv("pseudobulk/DESeq2_combat.csv") %>%
mutate(abs_lfc = abs(log2FoldChange))
## Rows: 9235 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): gene
## dbl (6): baseMean, log2FoldChange, lfcSE, stat, pvalue, padj
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
names(deseq2_combat)[names(deseq2_combat) == 'gene'] <- 'Symbol'
newrow <- c("deseq2_combat", dim(deseq2_combat)[1])
num_genes[nrow(num_genes) + 1, ] <- newrow
deseq2_sva <- read_csv("pseudobulk/DESeq2_sva.csv") %>%
mutate(abs_lfc = abs(log2FoldChange))
## Rows: 4216 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): gene
## dbl (6): baseMean, log2FoldChange, lfcSE, stat, pvalue, padj
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
names(deseq2_sva)[names(deseq2_sva) == 'gene'] <- 'Symbol'
newrow <- c("deseq2_sva", dim(deseq2_sva)[1])
num_genes[nrow(num_genes) + 1, ] <- newrow
num_genes$nGenes = as.integer(num_genes$nGenes)
plot_genes(num_genes)
grid.newpage()
venn_object <- venn.diagram(
x = list(deseq2$Symbol, deseq2_combat$Symbol, deseq2_sva$Symbol),
category.names = c("deseq2", "deseq2_combat","deseq2_sva"),
filename = NULL
)
grid.draw(venn_object)
DESeq2 without batch correction found the least number of DE genes
compared to the methods tested until now. Also, the DEgenes found with
batch corrected methods differs from DESeq2.
print("Top upregulated genes")
## [1] "Top upregulated genes"
plot_top_genes(deseq2,"log2FoldChange")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.193
## Picking joint bandwidth of 0.188
## Picking joint bandwidth of 0.136
print("Top downregulated genes")
## [1] "Top downregulated genes"
plot_bottom_genes(deseq2,"log2FoldChange")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.0609
## Picking joint bandwidth of 0.0838
## Picking joint bandwidth of 0.103
print("lowest abs(LFC) genes")
## [1] "lowest abs(LFC) genes"
plot_low_genes(deseq2,"abs_lfc")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.175
## Picking joint bandwidth of 0.0557
## Picking joint bandwidth of 0.0568
plot_upGOs(deseq2, "log2FoldChange")
## [1] "Upregulated GO terms"
plot_downGOs(deseq2, "log2FoldChange")
## [1] "Downregulated GO terms"
Genes with high and low logFC are well divided. Also, the genes with low abs(logDC) show better division between the cell types. BP terms aren’t really specific.
print("Top upregulated genes")
## [1] "Top upregulated genes"
plot_top_genes(deseq2_combat,"log2FoldChange")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.0407
## Picking joint bandwidth of 0.193
## Picking joint bandwidth of 0.188
print("Top downregulated genes")
## [1] "Top downregulated genes"
plot_bottom_genes(deseq2_combat,"log2FoldChange")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.106
## Picking joint bandwidth of 0.103
## Picking joint bandwidth of 0.121
print("lowest abs(LFC) genes")
## [1] "lowest abs(LFC) genes"
plot_low_genes(deseq2_combat,"abs_lfc")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.0838
## Picking joint bandwidth of 0.066
## Picking joint bandwidth of 0.0725
plot_upGOs(deseq2_combat, "log2FoldChange")
## [1] "Upregulated GO terms"
plot_downGOs(deseq2_combat, "log2FoldChange")
## [1] "Downregulated GO terms"
Genes with high and low logFC are well divided. Also, the genes with low abs(logDC) show better division between the cell types. BP terms aren’t really specific.
BP terms aren’t really specific. read this for vescicole and autophagy.
print("Top upregulated genes")
## [1] "Top upregulated genes"
plot_top_genes(deseq2_sva,"log2FoldChange")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 1.92e-06
## Picking joint bandwidth of 0.098
## Picking joint bandwidth of 0.136
print("Top downregulated genes")
## [1] "Top downregulated genes"
plot_bottom_genes(deseq2_sva,"log2FoldChange")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 1.95e-06
## Picking joint bandwidth of 0.0277
## Picking joint bandwidth of 1.81e-06
print("lowest abs(LFC) genes")
## [1] "lowest abs(LFC) genes"
plot_low_genes(deseq2_sva,"abs_lfc")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.0452
## Picking joint bandwidth of 0.0349
## Picking joint bandwidth of 2.26e-06
plot_upGOs(deseq2_sva, "log2FoldChange")
## [1] "Upregulated GO terms"
plot_downGOs(deseq2_sva, "log2FoldChange")
## [1] "Downregulated GO terms"
Despite both found ~4k DE genes, only a partial overlap of DE genes between DESeq2 + sva and Seurat. Also, the top lgFC genes have really poor expression.
Normal and with combat
limma <- read_csv("pseudobulk/limma_voom.csv")
## New names:
## * `` -> ...1
## Rows: 7580 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): ...1
## dbl (7): logFC, AveExpr, t, P.Value, adj.P.Val, B, absFLC
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
names(limma)[names(limma) == '...1'] <- 'Symbol'
newrow <- c("limma", dim(limma)[1])
num_genes[nrow(num_genes) + 1, ] <- newrow
limma_combat <- read_csv("pseudobulk/limma_combat.csv")
## New names:
## * `` -> ...1
## Rows: 7580 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): ...1
## dbl (7): logFC, AveExpr, t, P.Value, adj.P.Val, B, absFLC
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
names(limma_combat)[names(limma_combat) == '...1'] <- 'Symbol'
newrow <- c("limma_combat", dim(limma_combat)[1])
num_genes[nrow(num_genes) + 1, ] <- newrow
plot_genes(num_genes)
grid.newpage()
venn_object <- venn.diagram(
x = list(limma$Symbol, limma_combat$Symbol),
category.names = c("limma", "limma_combat"),
filename = NULL
)
grid.draw(venn_object)
Limma found more than 7k genes, no differences with the batch corrected data
print("Top upregulated genes")
## [1] "Top upregulated genes"
plot_top_genes(limma,"logFC")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.0407
## Picking joint bandwidth of 0.193
## Picking joint bandwidth of 0.188
print("Top downregulated genes")
## [1] "Top downregulated genes"
plot_bottom_genes(limma,"logFC")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.106
## Picking joint bandwidth of 0.103
## Picking joint bandwidth of 0.0609
print("lowest abs(LFC) genes")
## [1] "lowest abs(LFC) genes"
plot_low_genes(limma,"absFLC")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 2.34e-06
## Picking joint bandwidth of 2.04e-06
## Picking joint bandwidth of 2.2e-06
plot_upGOs(limma, "logFC")
## [1] "Upregulated GO terms"
plot_downGOs(limma, "logFC")
## [1] "Downregulated GO terms"
High logFC are clear, low abs(logFC) aren’t clear Read this and this for cilia
Finally I tested edgeR with LRT as best option from here, I tested combat too. edgeR edgeR_combat
edger <- read_csv("pseudobulk/edgeR.csv")
## New names:
## * `` -> ...1
## Rows: 9415 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): ...1, genes
## dbl (6): logFC, logCPM, LR, PValue, FDR, absLFC
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
names(edger)[names(edger) == 'genes'] <- 'Symbol'
newrow <- c("edger", dim(edger)[1])
num_genes[nrow(num_genes) + 1, ] <- newrow
edger_combat <- read_csv("pseudobulk/edgeR_combat.csv")
## New names:
## * `` -> ...1
## Rows: 9163 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): ...1, genes
## dbl (6): logFC, logCPM, LR, PValue, FDR, absLFC
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
names(edger_combat)[names(edger_combat) == 'genes'] <- 'Symbol'
newrow <- c("edger_combat", dim(edger_combat)[1])
num_genes[nrow(num_genes) + 1, ] <- newrow
plot_genes(num_genes)
grid.newpage()
venn_object <- venn.diagram(
x = list(edger$Symbol, edger_combat$Symbol),
category.names = c("edgeR", "edgeR_combat"),
filename = NULL
)
grid.draw(venn_object)
Both the methods found more than 9k genes with some hundreds uniquel per each method.
print("Top upregulated genes")
## [1] "Top upregulated genes"
plot_top_genes(edger,"logFC")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.098
## Picking joint bandwidth of 0.0407
## Picking joint bandwidth of 2.06e-06
print("Top downregulated genes")
## [1] "Top downregulated genes"
plot_bottom_genes(edger,"logFC")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.0977
## Picking joint bandwidth of 0.108
## Picking joint bandwidth of 0.121
print("lowest abs(LFC) genes")
## [1] "lowest abs(LFC) genes"
plot_low_genes(edger,"absLFC")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.0325
## Picking joint bandwidth of 0.0525
## Picking joint bandwidth of 0.127
plot_upGOs(edger, "logFC")
## [1] "Upregulated GO terms"
plot_downGOs(edger, "logFC")
## [1] "Downregulated GO terms"
Extreme LogFC show different results from what obtained with other algorithms, maybe because of the hand-written method. Low abs(logFC) are not appreciable. BP terms looks good
print("Top upregulated genes")
## [1] "Top upregulated genes"
plot_top_genes(edger_combat,"logFC")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.0407
## Picking joint bandwidth of 0.193
## Picking joint bandwidth of 0.188
print("Top downregulated genes")
## [1] "Top downregulated genes"
plot_bottom_genes(edger_combat,"logFC")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.0977
## Picking joint bandwidth of 0.108
## Picking joint bandwidth of 0.121
print("lowest abs(LFC) genes")
## [1] "lowest abs(LFC) genes"
plot_low_genes(edger_combat,"absLFC")
## Warning in xtfrm.data.frame(x): cannot xtfrm data frames
## Picking joint bandwidth of 0.0719
## Picking joint bandwidth of 0.0588
## Picking joint bandwidth of 0.0529
plot_upGOs(edger_combat, "logFC")
## [1] "Upregulated GO terms"
plot_downGOs(edger_combat, "logFC")
## [1] "Downregulated GO terms"
High number of DE genes found, the low(logFC) aren’t well distinguishable. GOOD BPs
VlnPlot(dat, features = c("HBA1", "HBA2"), group.by = "bioprojects", split.by = "label", ncol = 2)
## Warning: Groups with fewer than two data points have been dropped.
## Warning: Groups with fewer than two data points have been dropped.
dat <- subset(dat, subset = bioprojects == "PRJNA694128")
VlnPlot(dat, features = c("HBA1", "HBA2"), group.by = "batch", split.by = "label", ncol = 1)
RRMM_BM_8 has a lower expression of both the subunits… Interestingly is a longitudinal sample and we can look at the expression of the genes along the samples. Also, are those cells misslabelled? or the expression of HBA1/2 is lower in all the cell types. To check it I am loading the original dataset and plot the expression of all the samples from this patient:
rm(list = ls())
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 6723903 359.1 21899943 1169.6 27374928 1462.0
## Vcells 33947399 259.0 123000667 938.5 153750833 1173.1
data <- readRDS("MM_atlas.Rds")
data <- subset(data, label %in% c("Late erythroid progenitor", "Aberrant erythroid"))
data <- NormalizeData(data)
VlnPlot(data, features = c("HBA1", "HBA2"), group.by = "label", ncol = 1)
VlnPlot(data, features = c("HBA1", "HBA2"), group.by = "bioprojects", split.by= "label", ncol = 1)
#VlnPlot(data, features = c("HBA1", "HBA2"), group.by = "bioprojects", ncol = 1)
data <- subset(data, subset = bioprojects == "PRJNA694128")
VlnPlot(data, features = c("HBA1", "HBA2"), group.by = "batch", ncol = 1)
data <- subset(data, subset = batch == c("remission_BM_1", "RRMM_BM_1", "RRMM_BM_8", "PMM_CD138p_1"))
VlnPlot(data, features = c("HBA1", "HBA2"), group.by = "batch", ncol = 1)
Interestingly also the expression of HBA1/2 in other cell lines is lower in RR_BM_8, even looking at the other longitudinal data from the same patient